##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Loading required package: bitops
ggplot2는 자주 쓰이고 유명한 R visualization package 중 하나이다. Layer 개념으로 사용하기 쉽고, 다양한 테마들과 옵션값들을 수정함으로써 자신이 원하는 모양의 그래프를 빠르게 그려볼 수 있다.
먼저 다음 코드를 입력하여 package를 설치하자.
# One-time setup: fetch ggplot2 from the package repository, then attach it.
install.packages(pkgs = "ggplot2")
library("ggplot2")

# Peek at the wide-format example data used by the base-graphics calls below.
head(simpledat)
## A1 A2 A3
## B1 10 7 12
## B2 9 11 6
# Base graphics, grouped bars (beside = TRUE puts the bars next to each other
# instead of stacking them).
barplot(simpledat, beside = TRUE)

# Base graphics, lines: plot() draws row 1 and sets up the axes from it;
# lines() then overlays row 2 in blue on those same axes.
plot(simpledat[1, ], type = "l")
lines(simpledat[2, ], type = "l", col = "blue")
# Show the first rows of the long-format data (one value per Aval/Bval pair)
# that the ggplot2 calls below consume.
head(simpledat_long)
## Aval Bval value
## 1 A1 B1 10
## 2 A1 B2 9
## 3 A2 B1 7
## 4 A2 B2 11
## 5 A3 B1 12
## 6 A3 B2 6
x = Aval
fill = Bval
y = value
그래프 = geom_bar()
# Grouped bar chart: bar heights are taken directly from `value`
# (stat = "identity") and the Bval groups sit side by side ("dodge").
ggplot(data = simpledat_long, mapping = aes(x = Aval, y = value, fill = Bval)) +
  geom_bar(stat = "identity", position = "dodge")

# Line chart of the same data: one coloured line per Bval level.
ggplot(data = simpledat_long,
       mapping = aes(x = Aval, y = value, colour = Bval, group = Bval)) +
  geom_line()
ggplot2의 그래프는 Defaults + Layers + Scales + Coordinate System의 형태로 구성된다.
data visualization을 할 때 가장 중요한 것은 먼저 코딩을 하는 것이 아니라 데이터 셋을 파악하고 생각해 보는 것이다. 어떤 변수들이 무슨 특성을 가지고 있는지 혹은 크기나 규모는 얼마나 되는지 파악한 후 데이터를 어떻게 나타낼 것인지 먼저 손그림을 그려보는 것도 좋다. 지금은 간단하게 R에서 제공하는 데이터 셋을 기반으로 설명하도록 한다.
다양한 R에서 제공하는 데이터 셋 중 ChickWeight라는 데이터셋이 있다. 잠깐 살펴보면 다음과 같다.
## [1] 578 4
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
## $ weight: num 42 51 59 64 76 93 106 125 149 171 ...
## $ Time : num 0 2 4 6 8 10 12 14 16 18 ...
## $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "formula")=Class 'formula' length 3 weight ~ Time | Chick
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "outer")=Class 'formula' length 2 ~Diet
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Time"
## ..$ y: chr "Body weight"
## - attr(*, "units")=List of 2
## ..$ x: chr "(days)"
## ..$ y: chr "(gm)"
처음 보는 데이터셋이기 때문에 이해가 잘 가지 않는다면 다음과 같이 입력하여 어떤 데이터셋인지 패널에 나오는 document를 읽어본다.
# Open the ChickWeight help page directly. `??` performs a help search and
# only lists matching pages; a single `?` goes straight to the dataset's
# documentation.
?ChickWeight
따라해봅시다.
# One growth curve per chick (group = Chick), coloured by diet.
chickLine <- ggplot(ChickWeight,
                    aes(x = Time, y = weight, colour = Diet, group = Chick))
chickLine + geom_line()

# Smoothed weight trend per diet, drawn with a wide line and translucent band.
chickSmooth <- ggplot(ChickWeight, aes(x = Time, y = weight, colour = Diet))
chickSmooth + geom_smooth(alpha = 0.4, size = 3)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Two layers on one base plot: the raw observations plus a per-diet smoother.
chick2Layer <- ggplot(ChickWeight, aes(x = Time, y = weight, colour = Diet))
chick2Layer +
  geom_point(alpha = 0.3) +
  geom_smooth(alpha = 0.2, size = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Weight distribution at Time == 21, one density curve per diet.
chickDensity <- ggplot(subset(ChickWeight, Time == 21),
                       aes(x = weight, colour = Diet))
chickDensity + geom_density()

# The same Time == 21 subset as histograms, one facet row per diet.
chickFacet <- ggplot(subset(ChickWeight, Time == 21),
                     aes(x = weight, fill = Diet))
chickFacet +
  geom_histogram(colour = "black", binwidth = 50) +
  facet_grid(Diet ~ .)
따라치면서 공통점을 파악할 수 있었나요? 간단한 syntax를 설명하자면 다음과 같다.
ggplot(dataset, aes(x, y)) + geom_line()
## [1] 478 6
## 'data.frame': 478 obs. of 6 variables:
## $ date : Date, format: "1967-06-30" "1967-07-31" ...
## $ pce : num 508 511 517 513 518 ...
## $ pop : int 198712 198911 199113 199311 199498 199657 199808 199920 200056 200208 ...
## $ psavert : num 9.8 9.8 9 9.8 9.7 9.4 9 9.5 8.9 9.6 ...
## $ uempmed : num 4.5 4.7 4.6 4.9 4.7 4.8 5.1 4.5 4.1 4.6 ...
## $ unemploy: int 2944 2945 2958 3143 3066 3018 2878 3001 2877 2709 ...
## date pce pop psavert uempmed unemploy
## 1 1967-06-30 507.8 198712 9.8 4.5 2944
## 2 1967-07-31 510.9 198911 9.8 4.7 2945
## 3 1967-08-31 516.7 199113 9.0 4.6 2958
## 4 1967-09-30 513.3 199311 9.8 4.9 3143
## 5 1967-10-31 518.5 199498 9.7 4.7 3066
## 6 1967-11-30 526.2 199657 9.4 4.8 3018
# `unemploy` against `date`. The base plot is built once; each call below adds
# a differently styled line layer to it.
ecoLine <- ggplot(economics, aes(x = date, y = unemploy))

ecoLine + geom_line()                      # default line
ecoLine + geom_line(colour = "darkgreen")  # fixed colour
ecoLine + geom_line(linetype = 2)          # line type chosen by number
ecoLine + geom_line(linetype = "dotdash")  # line type chosen by name
## [1] 32 11
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# wt vs. mpg scatter base with hp mapped to a continuous colour scale.
mtcarsPoint <- ggplot(mtcars, aes(x = wt, y = mpg, colour = hp))

# Set the visible y range to 0-40 through the coordinate system.
mtcarsPoint + geom_point() + coord_cartesian(ylim = c(0, 40))
# Put the colour-scale breaks at 100 and 300 only.
mtcarsPoint + geom_point() + scale_colour_continuous(breaks = c(100, 300))
# Request a colour-bar guide explicitly.
mtcarsPoint + geom_point() + guides(colour = "colourbar")

# %+% swaps the data behind an existing plot: redraw with the first ten rows.
mtcarsPointPlot <- mtcarsPoint + geom_point()
mtcarsSubset <- mtcars[1:10, ]
mtcarsPointPlot %+% mtcarsSubset

# A layer-level aes() overrides the plot-level colour mapping for that layer.
mtcarsPoint + geom_point(aes(colour = factor(cyl)), size = 4)
mtcarsPoint <- ggplot(mtcars, aes(x = wt, y = mpg))
# Set (rather than map) the colour: a constant belongs outside aes(). Inside
# aes() the string "darkred" is treated as a one-level variable, so the points
# are drawn in the default palette colour with a spurious legend instead of
# actually being dark red.
mtcarsPoint + geom_point(colour = "darkred", size = 4)
# Shape mapped to cylinder count while colour stays mapped to hp.
mtcarsPoint <- ggplot(mtcars, aes(x = wt, y = mpg, colour = hp))
mtcarsPoint + geom_point(aes(shape = factor(cyl)), size = 4)

# From here on the base plot carries no colour mapping.
mtcarsPoint <- ggplot(mtcars, aes(x = wt, y = mpg))
mtcarsPoint + geom_point(shape = 5, size = 4)  # fixed shape, given by number

# Chart of shape codes 1-25: z holds the code and scale_shape_identity()
# uses it as-is. x cycles through 1-5 (written out here instead of relying on
# data.frame() recycling).
shapeType <- data.frame(x = rep(1:5, times = 5), y = 1:25, z = 1:25)
shapeTypePlot <- ggplot(shapeType, aes(x = x, y = y))
shapeTypePlot + geom_point(aes(shape = z), size = 4) + scale_shape_identity()

mtcarsPoint + geom_point(shape = "k", size = 3)  # a character as the symbol
mtcarsPoint + geom_point(aes(size = qsec))       # point size mapped to qsec

# Bar chart with cyl treated as a discrete x variable.
mtcarsBar <- ggplot(mtcars, aes(x = factor(cyl)))
mtcarsBar + geom_bar()

# Print the structure of mtcars.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Histogram of drat; no binwidth is given, so ggplot2 picks its default.
mtcarsHistogram <- ggplot(data = mtcars, mapping = aes(x = drat))
mtcarsHistogram + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Bar styling on one shared base: fill, colour, then both together.
mtcarsBar <- ggplot(mtcars, aes(x = factor(cyl)))
mtcarsBar + geom_bar(fill = "red")
mtcarsBar + geom_bar(colour = "red")
mtcarsBar + geom_bar(colour = "red", fill = "white")

# Scatter with a thick horizontal reference line at mpg = 25.
mtcarsPoint <- ggplot(mtcars, aes(x = wt, y = mpg))
mtcarsPoint + geom_point(size = 2.5) + geom_hline(yintercept = 25, size = 3.5)

# Keep the point layer in the stored plot, then shade a rectangular region.
mtcarsPoint <- mtcarsPoint + geom_point()
mtcarsPoint +
  annotate("rect", xmin = 2, xmax = 3.5, ymin = 20, ymax = 25,
           fill = "red", alpha = 0.3)

# Points plus a smoother, stored first and then printed.
mtcarsPlot <- mtcarsPoint + geom_smooth()
mtcarsPlot
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# qplot() shorthand for a disp-vs-wt scatter, with a smoother layered on top.
mtcarsPlot <- qplot(x = disp, y = wt, data = mtcars) + geom_smooth()
# Scale limits drop the observations outside 325-500 (hence the
# "Removed ... rows" warnings from both the points and the smoother).
mtcarsPlot + scale_x_continuous(limits = c(325, 500))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## Warning: Removed 24 rows containing missing values (stat_smooth).
## Warning: Removed 24 rows containing missing values (geom_point).
## [1] 58788 24
## 'data.frame': 58788 obs. of 24 variables:
## $ title : chr "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
## $ year : int 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
## $ length : int 121 71 7 70 71 91 93 25 97 61 ...
## $ budget : int NA NA NA NA NA NA NA NA NA NA ...
## $ rating : num 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
## $ votes : int 348 20 5 6 17 45 200 24 18 51 ...
## $ r1 : num 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r2 : num 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
## $ r3 : num 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
## $ r4 : num 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
## $ r5 : num 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
## $ r6 : num 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
## $ r7 : num 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
## $ r8 : num 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
## $ r9 : num 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
## $ r10 : num 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
## $ mpaa : Factor w/ 5 levels "","NC-17","PG",..: 1 1 1 1 1 1 5 1 1 1 ...
## $ Action : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Animation : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Comedy : int 1 1 0 1 0 0 0 0 0 0 ...
## $ Drama : int 1 0 0 0 0 1 1 0 1 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 1 0 0 ...
## $ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Short : int 0 0 1 0 0 0 0 1 0 0 ...
## title year length budget rating votes r1 r2 r3
## 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5
## 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0 14.5 4.5
## 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0 0.0 0.0
## 4 $40,000 1996 70 NA 8.2 6 14.5 0.0 0.0
## 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5 4.5 0.0
## 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5
## r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation Comedy Drama
## 1 4.5 14.5 24.5 24.5 14.5 4.5 4.5 0 0 1 1
## 2 24.5 14.5 14.5 14.5 4.5 4.5 14.5 0 0 1 0
## 3 0.0 0.0 24.5 0.0 44.5 24.5 24.5 0 1 0 0
## 4 0.0 0.0 0.0 0.0 0.0 34.5 45.5 0 0 1 0
## 5 14.5 14.5 4.5 0.0 0.0 0.0 24.5 0 0 0 0
## 6 14.5 14.5 14.5 4.5 4.5 14.5 14.5 0 0 0 1
## Documentary Romance Short
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
# Histogram of movie ratings with the default binning.
moviesHist <- ggplot(data = movies, mapping = aes(x = rating))
moviesHist + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Same histogram, with each bar's fill mapped to its own count.
moviesHist <- ggplot(data = movies, mapping = aes(x = rating))
moviesHist + geom_histogram(mapping = aes(fill = ..count..))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## [1] 53940 10
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Density-scaled carat histograms, one facet column per cut.
diaHist <- ggplot(diamonds, aes(x = carat, y = ..density..))
diaHist + geom_histogram(binwidth = 0.2) + facet_grid(. ~ cut)

# Bars of clarity filled by cut ...
diaHist <- ggplot(diamonds, aes(x = clarity, fill = cut))
diaHist + geom_bar()
# ... and the same bars with an `order` aesthetic of descending cut.
diaHist + geom_bar(aes(order = desc(cut)))
# Regress price on cut, then tabulate the fitted value and its standard error
# for every distinct cut level.
diaLm <- lm(price ~ cut, data = diamonds)
cuts <- data.frame(
  cut = unique(diamonds$cut),
  predict(
    diaLm,
    newdata = data.frame(cut = unique(diamonds$cut)),
    se.fit = TRUE
  )[c("fit", "se.fit")]
)
head(cuts)
## cut fit se.fit
## 1 Ideal 3457.542 27.00121
## 2 Premium 4584.258 33.75352
## 3 Good 3928.864 56.59175
## 4 Very Good 3981.760 36.06181
## 5 Fair 4358.758 98.78795
# Fitted price per cut drawn as a point with a +/- one standard error range.
diaRange <- ggplot(
  cuts,
  aes(x = cut, y = fit, ymin = fit - se.fit, ymax = fit + se.fit, colour = cut)
)
diaRange + geom_pointrange()

# Two-dimensional binning of carat against price (bins = 25, grey outlines).
diaStat <- ggplot(diamonds, aes(x = carat, y = price))
diaStat + stat_bin2d(bins = 25, colour = "grey50")

# Price box plots per cut, first as-is and then with the axes flipped.
diaBox <- ggplot(diamonds, aes(x = cut, y = price))
diaBox + geom_boxplot()
diaBox + geom_boxplot() + coord_flip()
# Download the air-pollution CSV and parse it into a data frame.
url <- "https://raw.githubusercontent.com/joshualog/DataDesigner/master/airPollution.csv"
# getURL() (presumably RCurl's; its library() call is not visible in this
# part of the file) returns the response body as a single string. Handing that
# string to read.csv(text = ...) lets read.csv create and close its own text
# connection; a textConnection() opened by hand and passed in is never closed.
airPollution <- read.csv(text = getURL(url))
str(airPollution)
## 'data.frame': 2199 obs. of 9 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date : num 2.02e+11 2.02e+11 2.02e+11 2.02e+11 2.02e+11 ...
## $ guName: Factor w/ 25 levels "강남구","강동구",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ no2 : num 0.043 0.046 0.04 0.026 0.021 0.018 0.021 0.037 0.046 0.038 ...
## $ o3 : num 0.02 0.021 0.014 0.022 0.022 0.025 0.023 0.012 0.006 0.015 ...
## $ co : num 0.4 0.4 0.4 0.3 0.3 0.3 0.3 0.4 0.4 0.4 ...
## $ so2 : num 0.006 0.008 0.008 0.008 0.007 0.006 0.007 0.006 0.007 0.007 ...
## $ pm10 : int 44 49 49 41 35 36 45 47 36 58 ...
## $ pm25 : int 19 17 28 27 18 25 20 27 17 27 ...
## Warning: Removed 79 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
다 설명하긴 너무 많으므로 help를 이용한다.
# Open the help page for ggplot2's theme(). The earlier `??theme - ggplot2`
# parsed as a help search on the expression `theme - ggplot2`, not as a lookup
# restricted to the ggplot2 package.
?ggplot2::theme
ggthemes라는 패키지는 여러 가지 테마를 제공하고 있다.
library(ggthemes)
# unemploy-over-date line chart drawn with theme_economist().
ecoLine <- ggplot(data = economics, mapping = aes(x = date, y = unemploy))
ecoLine +
  geom_line() +
  theme_economist()
기존에 많은 데이터를 원하는 모양으로 표현하기 위하여 excel이나 illustrator, 기존에 있던 통계 프로그램인 SAS, SPSS를 많이 사용해왔다. 하지만 SAS나 SPSS는 원하는 모양대로 customizing하기가 어려웠고, excel 또한 한계가 분명했다. illustrator는 많은 데이터를 일일이 손으로 그려주기엔 무리가 많았다. 그리하여 processing이 탄생하고 javascript에 d3.js, crossfilter.js 등 많은 라이브러리들이 있으나 프로그래밍을 하기까지는 시간이 오래 걸린다. R은 이런 부분에서 초보자도 쉽게 customizing이 가능하고 많은 양의 데이터를 쉽게 그려줄 수 있도록 다양한 패키지를 제공한다. 개인적인 추천서는 R Graphics Cookbook 으로 어떤 시각화 책보다 많은 index들을 가지고 있으며 이런 부분에 관심이 많다면 한번쯤 읽어보는 것이 좋다.
Copyright(c)2015 by Joshua. All Page content is property of Joshua